Data Cleaning

# data cleaning
# Data cleaning: drop unused columns, standardize column names, restrict the
# biographical data to quarterbacks, and join the two tables on player name.
qb_stats <- qb_stats %>%
  select(-Position) %>%
  rename("Rating" = "Passer Rating")

basic_stats <- basic_stats %>%
  rename(
    "Team" = "Current Team",
    "hsloc" = "High School Location",
    "Weight" = "Weight (lbs)",
    "Height" = "Height (inches)"
  ) %>%
  filter(Position == "QB")

# Combine per-QB passing stats with biographical info.
qb_comb <- merge(qb_stats, basic_stats, by = "Name")


# Recode the '--' placeholder to 0 BEFORE numeric conversion so as.numeric()
# does not silently introduce NAs, then coerce the counting stats to numeric.
qb_comb$`Passes Attempted`[which(qb_comb$`Passes Attempted` == '--')] <- 0
qb_comb <- qb_comb %>%
  mutate(
    `Passes Attempted` = as.numeric(`Passes Attempted`),
    `Games Played` = as.numeric(`Games Played`)
  )

# Drop columns not used by any model below.
qb_comb <- qb_comb %>% select(-c(Number, `Years Played`))

# Keep only the numeric part of Experience (e.g. "5 Seasons" -> 5).
qb_comb <- qb_comb %>%
  mutate(Experience = as.numeric(stringr::str_extract(Experience, '[0-9]+')))

# Last two characters of the high-school location are the state abbreviation.
qb_comb <- qb_comb %>% mutate(hsState = stringr::str_sub(hsloc, -2, -1))

# Conference is assigned POSITIONALLY: this literal vector must contain exactly
# one entry per row of qb_comb, in the exact row order produced by merge()
# above.
# NOTE(review): this is very fragile — any change to the input files, the
# filters above, or merge()'s row ordering silently mislabels conferences.
# A college -> conference lookup table joined by College would be safer;
# verify length(vector) == nrow(qb_comb) before reusing this code.
qb_comb <- qb_comb %>% mutate(Conference = c("PAC-12", "PAC-12", "PAC-12", "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "PAC-12", "PAC-12", "PAC-12", "PAC-12",
                                             "AAC", "AAC", "AAC",
                                             "Big 12", 
                                             "Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
                                             "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10",
                                             "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10", "Big 10",
                                             "ACC", "ACC",
                                             "ACC",
                                             "Mountain West", "Mountain West", "Mountain West",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "Big 10",
                                             "Big 10","Big 10","Big 10","Big 10","Big 10",
                                             "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC", "SEC",
                                             "Big 12", "Big 12", "Big 12", "Big 12", "Big 12", "Big 12",
                                             "Big 12","Big 12","Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
                                             "CUSA", "CUSA", "CUSA", "CUSA", "CUSA",
                                             "Mountain West", "Mountain West", "Mountain West",
                                             "Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy","Ivy",
                                             "Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association","Colonial Athletic Association",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "Big 12","Big 12","Big 12","Big 12","Big 12","Big 12",
                                             "Ohio Valley Conference", "Ohio Valley Conference", "Ohio Valley Conference",
                                             "ACC", "ACC", "ACC", "ACC",
                                             "PAC-12",
                                             "Big 12","Big 12","Big 12","Big 12","Big 12",
                                             "Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10",
                                             "ACC", "ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC",
                                             "PAC-12",
                                             "Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10","Big 10",
                                             "PAC-12","PAC-12",
                                             "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League", "Pioneer Football League",
                                             "Big 10",
                                             "Big 12","Big 12","Big 12","Big 12",
                                             "Mountain West","Mountain West","Mountain West","Mountain West","Mountain West","Mountain West",
                                             "AAC","AAC","AAC","AAC","AAC",
                                             "PAC-12",
                                             "ACC","ACC","ACC","ACC","ACC","ACC",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "AAC",
                                             "SEC","SEC","SEC","SEC","SEC","SEC","SEC",
                                             "SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC","SEC",
                                             "PAC-12","PAC-12",
                                             "ACC", "ACC", "ACC", "ACC",
                                             "PAC-12","PAC-12",
                                             "SEC", "SEC", "SEC",
                                             "Southland Conference", "Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference","Southland Conference",
                                             "CUSA", "CUSA", "CUSA", "CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA","CUSA", "Big 12",
                                             "Big 12","Big 12","Big 12","Big 12","Big 12","Big 12", "Big 10",
                                             "Big 10","Big 10","Big 10","Mountain West",
                                              "Mountain West", "Mountain West", "Mountain West", "PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","ACC",
                                             "ACC", "ACC", "ACC", "SEC",
                                             "SEC","SEC","SEC","SEC","SEC","AAC",
                                             "AAC", "AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","AAC","PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","Big 12",
                                             "Big 12", "ACC",
                                             "ACC","ACC","ACC","ACC","ACC","SEC",
                                             "ACC",
                                             "ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","MAC",
                                             "MAC", "MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","MAC","ACC",
                                             "ACC","ACC","ACC","ACC","ACC","ACC","ACC","ACC","PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","ACC",
                                             "ACC","SEC",
                                             "Big 10",
                                             "Big 10","SEC",
                                             "SEC", "PAC-12",
                                             "PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","PAC-12","Big 12","Big 12","Big 12","Big 12",
                                             "SEC", "SEC", "SEC","SEC",
                                             "SEC","SEC","SEC","SEC","Big 10","Big 10","Big 10","Big 10",
                                             "Big 10","Big 10","Big 10","Big 10","Big 10","SEC","SEC","SEC","SEC",
                                             "SEC", "Midwest Conference","Midwest Conference","Midwest Conference","ACC",
                                             "ACC", "ACC","ACC",
                                             "ACC","ACC","ACC","ACC","ACC","Big 10","Big 10","Big 10","Big 10",
                                             "Big 10","Big 10","CUSA","CUSA","CUSA","CUSA",
                                             "CUSA","CUSA","CUSA","Big 12","Big 12","Big 12","Big 12",
                                             "Big 12","Big 12","Missouri Valley Conference","Big 10","Big 10","Big 10",
                                             "Big 10",
                                             "Big 10","ACC", "ACC", "ACC", "ACC", "ACC", "ACC", "ACC", "ACC"
                                             ))

#tester <- qb_comb %>% select(Name, College, Conference)

# Keep only regular/post-season starts that ended in a win or loss, and drop
# identifier/context columns not used by the classification models below.
qb_logs <- qb_logs %>%
  filter(Season != "Preseason", Outcome != "T", `Games Started` == 1) %>%
  select(-c(Position, Week, `Game Date`, Score, `Games Played`, `Games Started`,
            Name, `Player Id`, Year, Season, Opponent, `Home or Away`))

# Encode the outcome as a factor with "L" as the reference level, so model
# coefficients describe the log-odds of winning.
qb_logs$Outcome <- as.factor(qb_logs$Outcome)
qb_logs <- qb_logs %>%
  mutate(Outcome = relevel(Outcome, ref = 'L'))

# Replace remaining NA values with 0.
qb_logs[is.na(qb_logs)] <- 0

Regression

# Creation of CV folds. Seed the RNG first so the fold assignment — and every
# CV metric computed from these folds below — is reproducible across runs
# (the classification section already seeds its folds at vfold_cv time).
set.seed(1512)
qbcomb_cv <- vfold_cv(qb_comb, v = 10)

LASSO (No Splines)

# LASSO model specification: glmnet with mixture = 1 (pure lasso penalty);
# penalty is left as tune() so it can be chosen by cross-validation.
lm_lasso_spec_tune <- linear_reg(mixture = 1, penalty = tune()) %>%
  set_mode('regression') %>%
  set_engine(engine = 'glmnet')


#LASSO recipe
# Step order matters: filter rows first; drop zero-variance predictors; guard
# against factor levels unseen at training time; normalize numeric predictors
# (required so the LASSO penalty treats all coefficients on a comparable
# scale); and create dummy indicators last.
lasso_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
    update_role(`Passes Attempted` , new_role = 'Info') %>%
    update_role(`Games Played` , new_role = 'Info') %>%
    step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
    step_nzv(all_predictors()) %>% # removes variables with the same value
    step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables 
    step_normalize(all_numeric_predictors()) %>%  # important standardization step for LASSO
    step_dummy(all_nominal_predictors())  # creates indicator variables for categorical variables

# LASSO workflow: tunable spec + preprocessing recipe
lasso_wf <- workflow() %>%
  add_model(lm_lasso_spec_tune) %>%
  add_recipe(lasso_rec)

# 50 candidate penalties, log10-spaced from 10^-5 to 10^3
penalty_grid <- grid_regular(penalty(range = c(-5, 3)), levels = 50)

# Score every candidate penalty on the 10 CV folds using RMSE
tune_res <- tune_grid(
  lasso_wf,
  resamples = qbcomb_cv,
  grid = penalty_grid,
  metrics = metric_set(rmse)
)

autoplot(tune_res)

# CV RMSE for each candidate penalty value
tune_res %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(penalty, rmse = mean)
## # A tibble: 50 × 2
##      penalty  rmse
##        <dbl> <dbl>
##  1 0.00001    37.6
##  2 0.0000146  37.6
##  3 0.0000212  37.6
##  4 0.0000309  37.6
##  5 0.0000450  37.6
##  6 0.0000655  37.6
##  7 0.0000954  37.6
##  8 0.000139   37.6
##  9 0.000202   37.6
## 10 0.000295   37.6
## # … with 40 more rows
# Choose the penalty with the lowest CV RMSE, lock it into the workflow,
# and refit the finalized LASSO on the full data set.
best_penalty <- select_best(tune_res, metric = 'rmse')
final_wf <- finalize_workflow(lasso_wf, best_penalty)
final_fit0 <- fit(final_wf, data = qb_comb)

# Coefficients sorted by size — a rough variable-importance ranking for the
# no-splines LASSO model.
output <- tidy(final_fit0)
output %>% arrange(desc(estimate))
## # A tibble: 47 × 3
##    term                              estimate penalty
##    <chr>                                <dbl>   <dbl>
##  1 (Intercept)                          89.3  0.00001
##  2 Conference_Ohio.Valley.Conference    52.7  0.00001
##  3 Conference_MAC                       16.2  0.00001
##  4 hsState_MS                           10.8  0.00001
##  5 Conference_Mountain.West              9.29 0.00001
##  6 Experience                            8.52 0.00001
##  7 Conference_Big.10                     5.38 0.00001
##  8 Conference_ACC                        4.76 0.00001
##  9 hsState_HI                            4.55 0.00001
## 10 Conference_PAC.12                     2.18 0.00001
## # … with 37 more rows

LASSO (No Splines) CV Metrics

#  calculate/collect CV metrics
# NOTE(review): final_fit0 is a *fitted* workflow; fit_resamples() is normally
# given the unfitted finalized workflow (final_wf) — confirm each fold is
# refit from scratch rather than reusing the full-data fit.
mod2_cv <- fit_resamples(final_fit0,
  resamples = qbcomb_cv, 
  metrics = metric_set(rmse, rsq, mae))

mod2_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 mae     standard   25.6      10  1.12   Preprocessor1_Model1
## 2 rmse    standard   37.6      10  1.30   Preprocessor1_Model1
## 3 rsq     standard    0.143    10  0.0350 Preprocessor1_Model1
#LASSO w/o splines is a far less accurate predictor of QB rating than the LASSO w/ splines model

LASSO w/ Splines

# recipes & workflows

# LASSO recipe (same predictors and preprocessing as the no-splines model)
lasso_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
    update_role(`Passes Attempted` , new_role = 'Info') %>%
    update_role(`Games Played` , new_role = 'Info') %>%
    step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% #removes potential outliers with few passes
    step_nzv(all_predictors()) %>% # removes variables with the same value
    step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
    step_normalize(all_numeric_predictors()) %>%  # important standardization step for LASSO
    step_dummy(all_nominal_predictors())  # creates indicator variables for categorical variables

# Add natural cubic splines for the numeric predictors
# (higher deg_free means more knots, i.e. more flexibility)
ns3_lasso <- lasso_rec %>%
  step_ns(Weight, deg_free = 3) %>%
  step_ns(Height, deg_free = 3) %>%
  step_ns(Age, deg_free = 3) %>%
  step_ns(Experience, deg_free = 3) %>%
  step_ns(`Games Played`, deg_free = 2)

# Tunable LASSO spec — identical to the earlier definition; the original
# section defined this spec and the workflow twice each, so the duplicates
# were collapsed into one definition of each.
lm_lasso_spec_tune <-
  linear_reg() %>%
  set_args(mixture = 1, penalty = tune()) %>% ## tune() indicates that we will try a variety of values
  set_engine(engine = 'glmnet') %>%
  set_mode('regression')

# LASSO-with-splines workflow
lasso_wf <- workflow() %>%
  add_recipe(ns3_lasso) %>%
  add_model(lm_lasso_spec_tune)

# Tune the penalty over the same 50-value log10 grid (10^-5 to 10^3),
# scored by CV RMSE on the 10 folds
penalty_grid <- grid_regular(penalty(range = c(-5, 3)), levels = 50)

tune_res <- tune_grid(
  lasso_wf,
  resamples = qbcomb_cv,
  grid = penalty_grid,
  metrics = metric_set(rmse)
)

autoplot(tune_res)

# CV RMSE for each candidate penalty value
tune_res %>%
  collect_metrics() %>%
  filter(.metric == 'rmse') %>%
  select(penalty, rmse = mean)
## # A tibble: 50 × 2
##      penalty  rmse
##        <dbl> <dbl>
##  1 0.00001    35.8
##  2 0.0000146  35.8
##  3 0.0000212  35.8
##  4 0.0000309  35.8
##  5 0.0000450  35.8
##  6 0.0000655  35.8
##  7 0.0000954  35.8
##  8 0.000139   35.8
##  9 0.000202   35.8
## 10 0.000295   35.8
## # … with 40 more rows
# Lock in the best penalty and refit the spline LASSO on the full data
best_penalty <- select_best(tune_res, metric = 'rmse')
final_wf <- finalize_workflow(lasso_wf, best_penalty)
final_fit <- fit(final_wf, data = qb_comb)

# Coefficients sorted by size — a rough variable-importance ranking for the
# LASSO-with-splines model.
output <- tidy(final_fit)
output %>% arrange(desc(estimate))
## # A tibble: 57 × 3
##    term                              estimate penalty
##    <chr>                                <dbl>   <dbl>
##  1 (Intercept)                          80.1    0.256
##  2 Conference_Ohio.Valley.Conference    29.4    0.256
##  3 Games Played_ns_1                    16.6    0.256
##  4 Games Played_ns_2                     9.75   0.256
##  5 Experience_ns_3                       7.02   0.256
##  6 hsState_HI                            5.47   0.256
##  7 Conference_MAC                        4.12   0.256
##  8 hsState_MS                            3.11   0.256
##  9 Conference_ACC                        2.98   0.256
## 10 hsState_OH                            2.79   0.256
## # … with 47 more rows

LASSO with Splines CV Metrics

#  calculate/collect CV metrics
# NOTE(review): final_fit is a *fitted* workflow; fit_resamples() is normally
# given the unfitted finalized workflow (final_wf) — confirm each fold is
# refit from scratch rather than reusing the full-data fit.
mod1_cv <- fit_resamples(final_fit,
  resamples = qbcomb_cv, 
  metrics = metric_set(rmse, rsq, mae))

mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 mae     standard   24.3      10  0.944  Preprocessor1_Model1
## 2 rmse    standard   35.5      10  1.08   Preprocessor1_Model1
## 3 rsq     standard    0.284    10  0.0341 Preprocessor1_Model1
#LASSO w/ Splines is a far better model for predicting QB passer rating than the ordinary LASSO model.

OLS Residual Evaluation

# Visual residual diagnostics for the spline LASSO model: predict on the
# training data, compute residuals, and apply the same row filter the
# recipe uses so fitted and plotted rows match.
mod1_output <- qb_comb %>%
    bind_cols(predict(final_fit, new_data = qb_comb)) %>%
    mutate(resid = Rating - .pred) %>%
    filter(`Passes Attempted` >= 50, `Games Played` > 0)

# Helper: residuals vs. one predictor, with a smoother and a zero reference
# line. `xvar` is the column name as a string (handles backtick-needing
# names via the .data pronoun). Replaces eight copy-pasted ggplot blocks.
plot_resid <- function(data, xvar) {
    ggplot(data, aes(x = .data[[xvar]], y = resid)) +
        geom_point() +
        geom_smooth() +
        geom_hline(yintercept = 0, color = "red") +
        theme_classic()
}

plot_resid(mod1_output, "Height")           # Height (non-linearity)
plot_resid(mod1_output, "Age")              # Age (some non-linearity)
plot_resid(mod1_output, "Weight")           # Weight (non-linearity)
plot_resid(mod1_output, "Experience")       # Experience (non-linearity)
plot_resid(mod1_output, "Conference")       # College Conference
plot_resid(mod1_output, "hsState")          # High School State
plot_resid(mod1_output, "Passes Attempted") # Passes Attempted (non-linearity)
plot_resid(mod1_output, "Games Played")     # Games Played (non-linearity)

OLS Model (No Splines)

# OLS model specification
lm_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')

# OLS recipe: same predictors as the LASSO models; infrequent factor levels
# (< 5%) are collapsed into an "other" category before dummy creation.
lm_rec <- recipe(Rating ~ Height + Age + Weight + Experience + Conference + hsState + `Passes Attempted` + `Games Played`, data = qb_comb) %>%
  update_role(`Passes Attempted`, new_role = 'Info') %>%
  update_role(`Games Played`, new_role = 'Info') %>%
  step_filter(`Passes Attempted` >= 50, `Games Played` > 0) %>% # removes potential outliers with few passes
  #step_lincomb(all_numeric_predictors()) %>% # removes predictors that are linear combos of others
  #step_corr(all_numeric_predictors()) %>% #removes highly correlated variables
  step_other(all_nominal_predictors(), threshold = .05) %>%
  step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables

# OLS workflow
lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(lm_rec)

fit(lm_wf, data = qb_comb) %>% tidy()
## # A tibble: 14 × 5
##    term              estimate std.error statistic   p.value
##    <chr>                <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)       185.        45.2       4.10  0.0000534
##  2 Height             -1.11       0.694    -1.60  0.111    
##  3 Age                -1.37       0.798    -1.72  0.0864   
##  4 Weight              0.0960     0.108     0.893 0.372    
##  5 Experience          1.62       0.786     2.07  0.0395   
##  6 Conference_Big.10  -4.74       3.09     -1.54  0.126    
##  7 Conference_Big.12  -8.97       3.37     -2.66  0.00824  
##  8 Conference_PAC.12 -11.2        3.23     -3.48  0.000582 
##  9 Conference_SEC     -5.26       2.98     -1.77  0.0780   
## 10 Conference_other   -9.79       2.85     -3.43  0.000678 
## 11 hsState_OH          2.70       4.11      0.657 0.511    
## 12 hsState_PA         -7.11       4.36     -1.63  0.104    
## 13 hsState_TX         -4.00       2.73     -1.46  0.144    
## 14 hsState_other      -6.94       2.65     -2.62  0.00912

OLS Model (No Splines) CV Metrics

# CV metrics for the no-splines OLS model
lm_cv <- fit_resamples(
  lm_wf,
  resamples = qbcomb_cv,
  metrics = metric_set(rmse, rsq, mae)
)

lm_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator    mean     n std_err .config             
##   <chr>   <chr>        <dbl> <int>   <dbl> <chr>               
## 1 mae     standard   26.9       10  0.915  Preprocessor1_Model1
## 2 rmse    standard   39.6       10  1.02   Preprocessor1_Model1
## 3 rsq     standard    0.0507    10  0.0127 Preprocessor1_Model1
#OLS (no splines) is a far worse predictor of QB passer rating than an OLS model incorporating natural cubic splines.

OLS Model w/ Splines

#qb_comb %>% purrr::map(~sum(is.na(.)))

# lm_spec and lm_rec are reused from the no-splines OLS section above; the
# original redefined both verbatim here, so the redundant copies were removed.

# Extend the OLS recipe with natural cubic splines on the numeric predictors
# (higher deg_free means more knots, i.e. more flexibility)
ns3_OLS <- lm_rec %>%
  step_ns(Weight, deg_free = 3) %>%
  step_ns(Height, deg_free = 3) %>%
  step_ns(Age, deg_free = 3) %>%
  step_ns(Experience, deg_free = 3) %>%
  step_ns(`Games Played`, deg_free = 2)

# OLS-with-splines workflow
splines_wf <- workflow() %>%
  add_recipe(ns3_OLS) %>%
  add_model(lm_spec)

fit(splines_wf, data = qb_comb) %>% tidy()
## # A tibble: 24 × 5
##    term              estimate std.error statistic  p.value
##    <chr>                <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)          83.7      10.1      8.25  4.87e-15
##  2 Conference_Big.10    -7.95      3.37    -2.36  1.89e- 2
##  3 Conference_Big.12    -5.88      3.25    -1.81  7.18e- 2
##  4 Conference_PAC.12    -3.40      3.77    -0.900 3.69e- 1
##  5 Conference_SEC       -1.55      3.18    -0.487 6.26e- 1
##  6 Conference_other     -5.37      2.91    -1.85  6.60e- 2
##  7 hsState_OH            9.04      4.44     2.04  4.23e- 2
##  8 hsState_PA            1.69      4.81     0.352 7.25e- 1
##  9 hsState_TX           -2.51      2.98    -0.843 4.00e- 1
## 10 hsState_other        -2.96      3.15    -0.940 3.48e- 1
## # … with 14 more rows

OLS Model w/ Splines CV Metrics

# CV metrics for the spline OLS model
splines_cv <- fit_resamples(
  splines_wf,
  resamples = qbcomb_cv,
  metrics = metric_set(rmse, rsq, mae)
)

splines_cv %>% collect_metrics()
## # A tibble: 3 × 6
##   .metric .estimator   mean     n std_err .config             
##   <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
## 1 mae     standard   24.4      10  0.901  Preprocessor1_Model1
## 2 rmse    standard   35.7      10  1.06   Preprocessor1_Model1
## 3 rsq     standard    0.271    10  0.0319 Preprocessor1_Model1

Classification

Logistic Regression Model

# Logistic regression spec (glm engine)
logistic_mod <- logistic_reg() %>%
    set_mode('classification') %>%
    set_engine("glm")

# Predict game outcome from every remaining column, with numeric predictors
# standardized
outcome_rec <- recipe(Outcome ~ ., data = qb_logs) %>%
  step_normalize(all_numeric_predictors())

# Fit the logistic workflow on all game logs
logistic_mod_fit <- workflow() %>%
  add_recipe(outcome_rec) %>%
  add_model(logistic_mod) %>%
  fit(data = qb_logs)

# Model coefficients from the trained model
logistic_mod_fit %>% tidy()
## # A tibble: 17 × 5
##    term                        estimate std.error statistic   p.value
##    <chr>                          <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)                   0.0869    0.0267     3.25  1.15e-  3
##  2 `Passes Completed`            0.418     0.152      2.76  5.81e-  3
##  3 `Passes Attempted`           -1.02      0.116     -8.82  1.18e- 18
##  4 `Completion Percentage`      -0.136     0.0861    -1.58  1.15e-  1
##  5 `Passing Yards`               0.289     0.0999     2.89  3.81e-  3
##  6 `Passing Yards Per Attempt`  -0.0444    0.0672    -0.661 5.08e-  1
##  7 `TD Passes`                   0.303     0.0639     4.75  2.06e-  6
##  8 Ints                         -0.301     0.0660    -4.57  4.98e-  6
##  9 Sacks                        -0.154     0.0607    -2.53  1.14e-  2
## 10 `Sacked Yards Lost`          -0.272     0.0606    -4.48  7.35e-  6
## 11 `Passer Rating`               0.639     0.135      4.75  2.03e-  6
## 12 `Rushing Attempts`            1.12      0.0483    23.1   1.68e-118
## 13 `Rushing Yards`              -0.790     0.0597   -13.2   4.92e- 40
## 14 `Yards Per Carry`            -0.0491    0.0415    -1.18  2.37e-  1
## 15 `Rushing TDs`                 0.0741    0.0291     2.54  1.10e-  2
## 16 Fumbles                      -0.227     0.0365    -6.23  4.52e- 10
## 17 `Fumbles Lost`               -0.164     0.0359    -4.59  4.53e-  6
library(dotwhisker)

# Dot-and-whisker plot of trained-model odds ratios. Inside the single
# mutate(), conf.low/conf.high are computed from the raw log-odds estimate
# BEFORE `estimate` is overwritten with its exponentiated value — mutate()
# evaluates its arguments sequentially, so the order matters.
logistic_mod_fit %>%
  tidy() %>%
  mutate(
    conf.low = exp(estimate - 1.96*std.error),
    conf.high = exp(estimate + 1.96*std.error),
    estimate = exp(estimate)
  ) %>%
  dwplot(
    dot_args = list(size = 2, color = "black"),
    whisker_args = list(color = "black"),
    vline = geom_vline(xintercept = 1, color = "grey50", linetype = 2)
  ) +
  labs(x = 'Odds Ratio') +
  theme_classic()

Soft Predictions

# Soft predictions: attach predicted win/loss probabilities (.pred_L,
# .pred_W) to every game log
logistic_output <- qb_logs %>%
  bind_cols(predict(logistic_mod_fit, new_data = qb_logs, type = 'prob'))

head(logistic_output)
## # A tibble: 6 × 19
##   Outcome `Passes Completed` `Passes Attempte… `Completion Perc… `Passing Yards`
##   <fct>                <dbl>             <dbl>             <dbl>           <dbl>
## 1 W                       18                29              62.1             176
## 2 L                        5                 8              62.5              25
## 3 L                       11                28              39.3             154
## 4 L                       19                36              52.8             230
## 5 W                       13                22              59.1             142
## 6 L                        4                13              30.8              67
## # … with 14 more variables: Passing Yards Per Attempt <dbl>, TD Passes <dbl>,
## #   Ints <dbl>, Sacks <dbl>, Sacked Yards Lost <dbl>, Passer Rating <dbl>,
## #   Rushing Attempts <dbl>, Rushing Yards <dbl>, Yards Per Carry <dbl>,
## #   Rushing TDs <dbl>, Fumbles <dbl>, Fumbles Lost <dbl>, .pred_L <dbl>,
## #   .pred_W <dbl>
# Visualize predicted win probabilities against the true outcome
logistic_output %>%
  ggplot(aes(x = Outcome, y = .pred_W)) +
  geom_boxplot() +
  geom_hline(yintercept = 0.5, color='red') +
  labs(x = 'Observed Outcome (L or W)', y = 'Predicted Probability of Winning') +
  theme_classic()

Hard Predictions

# The same diagnostic from the losing side
logistic_output %>%
  ggplot(aes(x = Outcome, y = .pred_L)) +
  geom_boxplot() +
  geom_hline(yintercept = 0.5, color='red') +
  labs(x = 'Observed Outcome (L or W)', y = 'Predicted Probability of LOSING') +
  theme_classic()

# Hard class predictions at the default 0.5 threshold (~77.1% accuracy)
logistic_output <- logistic_output %>%
  mutate(.pred_class = make_two_class_pred(.pred_L, levels(Outcome), threshold = .5))

#head(logistic_output)

#logistic_output %>%
#  count(Outcome, .pred_class)

# Confusion matrix of predicted vs. observed outcomes
logistic_output %>%
  conf_mat(truth = Outcome, estimate = .pred_class)
##           Truth
## Prediction    L    W
##          L 3428 1038
##          W 1072 3691

CV Accuracy Metrics

# 10-fold cross-validation of the logistic model
set.seed(1512)
qblogs_cv10 <- vfold_cv(qb_logs, v = 10)

# Fit on each resample and collect sensitivity, specificity, accuracy, and
# AUC. Predictions are saved because ROC calculations need them, and
# event_level = 'second' treats the second factor level ('W' here) as the
# event of interest.
log_cv_fit <- fit_resamples(
  logistic_mod_fit,
  resamples = qblogs_cv10,
  metrics = metric_set(sens, yardstick::spec, accuracy, roc_auc),
  control = control_resamples(save_pred = TRUE, event_level = 'second')
)

collect_metrics(log_cv_fit) #default threshold is 0.5
## # A tibble: 4 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.771    10 0.00420 Preprocessor1_Model1
## 2 roc_auc  binary     0.850    10 0.00483 Preprocessor1_Model1
## 3 sens     binary     0.782    10 0.00630 Preprocessor1_Model1
## 4 spec     binary     0.760    10 0.00428 Preprocessor1_Model1

Random Forest Model

# Recipe: predict Outcome from every remaining column, with numeric
# predictors standardized.
outcome_rf_rec <- recipe(Outcome ~ ., data = qb_logs) %>%
  step_normalize(all_numeric_predictors())

# Random forest specification on the ranger engine, classification mode.
rf_spec <- rand_forest() %>%
  set_engine(engine = 'ranger') %>%
  set_args(
    mtry = NULL,          # size of random subset of variables; default is floor(sqrt(ncol(x)))
    trees = 100,          # number of bags
    min_n = 5,            # minimum node size
    probability = FALSE,  # want hard predictions first
    importance = 'impurity'
  ) %>%
  set_mode('classification') # change this for regression tree

rf_spec
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   trees = 100
##   min_n = 5
## 
## Engine-Specific Arguments:
##   probability = FALSE
##   importance = impurity
## 
## Computational engine: ranger
# Bundle the recipe and the model specification, then fit on the full data.
outcome_rf_wf <- workflow() %>%
  add_recipe(outcome_rf_rec) %>%
  add_model(rf_spec)

set.seed(123)
outcome_rf_fit <- fit(outcome_rf_wf, data = qb_logs)

# Check out OOB prediction error (accuracy = 1 - OOB prediction error)
outcome_rf_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100,      min.node.size = min_rows(~5, x), probability = ~FALSE, importance = ~"impurity",      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 
## 
## Type:                             Classification 
## Number of trees:                  100 
## Sample size:                      9229 
## Number of independent variables:  16 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error:             22.74 %

OOB Error

# Pair ranger's out-of-bag class predictions with the true outcomes
outcome_rf_OOB_output <- tibble(
  .pred_class = pluck(extract_fit_engine(outcome_rf_fit), 'predictions'),
  Outcome = qb_logs$Outcome
)

# Metric bundle evaluated on the OOB predictions
bag_metrics <- metric_set(sens, yardstick::spec, accuracy)

outcome_rf_OOB_output %>%
  bag_metrics(truth = Outcome, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.779
## 2 spec     binary         0.767
## 3 accuracy binary         0.773

AUC

# Refit with probability = TRUE so ranger returns OOB probability estimates;
# reusing the seed reproduces the same bootstrap samples as the first fit.
set.seed(123)
outcome_rf_fit2 <- outcome_rf_wf %>%
  update_model(set_args(rf_spec, probability = TRUE)) %>%
  fit(data = qb_logs)

outcome_rf_fit2
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~100,      min.node.size = min_rows(~5, x), probability = ~TRUE, importance = ~"impurity",      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1)) 
## 
## Type:                             Probability estimation 
## Number of trees:                  100 
## Sample size:                      9229 
## Number of independent variables:  16 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1569944
# OOB probability predictions joined with the true outcomes
outcome_rf_OOB_output2 <- outcome_rf_fit2 %>%
  extract_fit_engine() %>%
  pluck('predictions') %>%
  as_tibble() %>%
  bind_cols(qb_logs %>% select(Outcome))

# ROC curve and AUC from the W probability column; event_level = "second"
# marks the second factor level (W) as the event.
outcome_rf_OOB_output2 %>%
  roc_curve(Outcome, W, event_level = "second") %>%
  autoplot()

outcome_rf_OOB_output2 %>%
  roc_auc(Outcome, W, event_level = "second") #Area under Curve
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.851

Variable Importance

library(vip) #install.packages('vip')

# Impurity-based importance from the already-fitted forest
vip(extract_fit_engine(outcome_rf_fit))

# Permutation-based importance requires refitting with importance = "permutation"
outcome_rf_wf %>%
  update_model(set_args(rf_spec, importance = "permutation")) %>%
  fit(data = qb_logs) %>%
  extract_fit_engine() %>%
  vip()

Model Comparison

# Cross-validated metrics for the logistic regression model
collect_metrics(log_cv_fit) #CV Logistic Model
## # A tibble: 4 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.771    10 0.00420 Preprocessor1_Model1
## 2 roc_auc  binary     0.850    10 0.00483 Preprocessor1_Model1
## 3 sens     binary     0.782    10 0.00630 Preprocessor1_Model1
## 4 spec     binary     0.760    10 0.00428 Preprocessor1_Model1
# Out-of-bag sensitivity/specificity/accuracy for the random forest
outcome_rf_OOB_output %>%  #RF OOB Accuracy
  bag_metrics(truth = Outcome, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.779
## 2 spec     binary         0.767
## 3 accuracy binary         0.773
# Out-of-bag AUC for the random forest (W treated as the event)
outcome_rf_OOB_output2 %>%  #RF AUC
  roc_auc(Outcome, W, event_level = "second")
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.851

Clustering

Dendrograms

# Drop the outcome label so clustering uses only the game statistics
qb_logs_sub <- select(qb_logs, -Outcome)

# Summary statistics for the clustering variables
summary(qb_logs_sub)
##  Passes Completed Passes Attempted Completion Percentage Passing Yards  
##  Min.   : 0.00    Min.   : 0.00    Min.   :  0.00        Min.   :  0.0  
##  1st Qu.:15.00    1st Qu.:25.00    1st Qu.: 53.70        1st Qu.:168.0  
##  Median :19.00    Median :32.00    Median : 60.70        Median :221.0  
##  Mean   :19.14    Mean   :31.57    Mean   : 60.38        Mean   :222.1  
##  3rd Qu.:23.00    3rd Qu.:38.00    3rd Qu.: 67.60        3rd Qu.:277.0  
##  Max.   :43.00    Max.   :69.00    Max.   :100.00        Max.   :527.0  
##  Passing Yards Per Attempt   TD Passes          Ints            Sacks       
##  Min.   : 0.000            Min.   :0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.700            1st Qu.:0.000   1st Qu.:0.0000   1st Qu.: 1.000  
##  Median : 6.900            Median :1.000   Median :1.0000   Median : 2.000  
##  Mean   : 7.082            Mean   :1.349   Mean   :0.9157   Mean   : 2.094  
##  3rd Qu.: 8.200            3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 3.000  
##  Max.   :66.000            Max.   :7.000   Max.   :7.0000   Max.   :11.000  
##  Sacked Yards Lost Passer Rating    Rushing Attempts Rushing Yards   
##  Min.   : 0.00     Min.   :  0.00   Min.   : 0.000   Min.   :-31.00  
##  1st Qu.: 4.00     1st Qu.: 64.80   1st Qu.: 1.000   1st Qu.:  0.00  
##  Median :11.00     Median : 84.30   Median : 2.000   Median :  5.00  
##  Mean   :13.51     Mean   : 84.13   Mean   : 2.622   Mean   : 10.09  
##  3rd Qu.:20.00     3rd Qu.:103.60   3rd Qu.: 4.000   3rd Qu.: 15.00  
##  Max.   :91.00     Max.   :158.30   Max.   :22.000   Max.   :181.00  
##  Yards Per Carry    Rushing TDs         Fumbles        Fumbles Lost   
##  Min.   :-26.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:  0.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :  2.000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :  2.969   Mean   :0.09459   Mean   :0.5383   Mean   :0.2234  
##  3rd Qu.:  5.000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   : 44.000   Max.   :3.00000   Max.   :6.0000   Max.   :3.0000
# Distance matrix on standardized variables, so no single statistic
# dominates the distances.
dist_mat_scaled <- dist(scale(qb_logs_sub))

# Hierarchical clustering of the (scaled) distance matrix under four
# linkage rules for comparison; `method` selects the linkage type.
hc_complete <- hclust(dist_mat_scaled, method = "complete")
hc_single   <- hclust(dist_mat_scaled, method = "single")
hc_average  <- hclust(dist_mat_scaled, method = "average")
hc_centroid <- hclust(dist_mat_scaled, method = "centroid")

# Plot dendrograms to compare cluster structure under each linkage type
plot(hc_complete)

plot(hc_single)

plot(hc_average)

plot(hc_centroid)

# Attach cluster assignments from the complete-linkage tree to qb_logs
qb_logs <- qb_logs %>%
    mutate(
        hclust_height3 = factor(cutree(hc_complete, h = 3)), # Cut at height (h) 3
        hclust_num6 = factor(cutree(hc_complete, k = 4)) # Cut into 4 clusters (k)
        # NOTE(review): the name "hclust_num6" suggests 6 clusters but k = 4
        # is used — confirm which was intended. Downstream code references
        # hclust_num6, so change only the k value or the name, not both.
    )

# Re-plot the dendrogram with leaves labeled by cluster assignment
plot(hc_complete, labels = qb_logs$hclust_height3)

plot(hc_complete, labels = qb_logs$hclust_num6)

PCA

# PCA on the game statistics (outcome and cluster labels removed);
# center and scale so each variable contributes comparably.
mat <- qb_logs %>%
  select(-Outcome, -hclust_height3, -hclust_num6) %>%
  as.matrix()

pca_out <- prcomp(mat, center = TRUE, scale = TRUE)

# First few rows of the loading (rotation) matrix
head(pca_out$rotation)
##                                 PC1         PC2         PC3         PC4
## Passes Completed          0.3716850 -0.27288303  0.18577370 -0.23266956
## Passes Attempted          0.2368035 -0.35424044  0.24246876 -0.34715781
## Completion Percentage     0.3777490  0.06715226 -0.07504039  0.16610358
## Passing Yards             0.4304861 -0.21129937  0.11267792 -0.13944330
## Passing Yards Per Attempt 0.3592990  0.10863208 -0.13577730  0.22293786
## TD Passes                 0.3696420  0.03053012 -0.02378479  0.01901386
##                                    PC5          PC6         PC7         PC8
## Passes Completed           0.013526852  0.044247834 -0.24386898  0.05570391
## Passes Attempted           0.041116903 -0.006248729 -0.27398074 -0.02432124
## Completion Percentage     -0.058537147  0.118479501  0.06141751  0.19643588
## Passing Yards              0.019214760  0.040710125  0.05196286 -0.02370818
## Passing Yards Per Attempt -0.029852332  0.085558698  0.48760486  0.03584002
## TD Passes                  0.008185073 -0.266913398  0.18212157 -0.24531741
##                                   PC9       PC10          PC11         PC12
## Passes Completed          -0.19037221 -0.1266949  0.0002363081  0.003278444
## Passes Attempted           0.05026027  0.1164014  0.0020067820  0.009255326
## Completion Percentage     -0.59303574 -0.4860531  0.0073397576  0.009831694
## Passing Yards              0.06211819  0.4021595 -0.0045303117 -0.001038738
## Passing Yards Per Attempt -0.07152699  0.5435094  0.0004705980  0.015312213
## TD Passes                  0.62608881 -0.4412705  0.0033561191 -0.013089873
##                                   PC13         PC14        PC15        PC16
## Passes Completed          -0.006028635 -0.006319295  0.19478709  0.73736015
## Passes Attempted           0.005755979 -0.571474156 -0.15821416 -0.44037360
## Completion Percentage     -0.013447817  0.030583562  0.17017476 -0.37627986
## Passing Yards              0.000403107  0.708455495 -0.02319586 -0.25405675
## Passing Yards Per Attempt  0.008272566 -0.408798164  0.24569647  0.12412746
## TD Passes                  0.008883324 -0.032655275  0.32278200 -0.06799853
# Scores on the first two PCs, annotated with cluster and outcome labels
pca_out_plot <- pca_out %>%
  pluck('x') %>%
  as.data.frame() %>%
  select(PC1, PC2) %>%
  bind_cols(tibble(cluster = qb_logs$hclust_num6, Outcome = qb_logs$Outcome))

pca_out %>% head()

Loadings

# PC1 loadings sorted by absolute magnitude
pca_out$rotation %>%
    as.data.frame() %>%
    select(PC1) %>%
    arrange(desc(abs(PC1)))
##                                   PC1
## Passing Yards              0.43048614
## Passer Rating              0.42219398
## Completion Percentage      0.37774897
## Passes Completed           0.37168501
## TD Passes                  0.36964203
## Passing Yards Per Attempt  0.35929901
## Passes Attempted           0.23680352
## Ints                      -0.11354133
## Sacks                     -0.08666869
## Sacked Yards Lost         -0.08394013
## Fumbles                   -0.05532546
## Fumbles Lost              -0.03497970
## Yards Per Carry           -0.02796717
## Rushing TDs                0.02396848
## Rushing Yards             -0.02123042
## Rushing Attempts           0.01555846
# PC2 loadings sorted by absolute magnitude
pca_out$rotation %>%
    as.data.frame() %>%
    select(PC2) %>%
    arrange(desc(abs(PC2)))
##                                   PC2
## Sacks                     -0.43922376
## Sacked Yards Lost         -0.42991899
## Passes Attempted          -0.35424044
## Fumbles                   -0.33962177
## Fumbles Lost              -0.30736245
## Passes Completed          -0.27288303
## Passing Yards             -0.21129937
## Ints                      -0.21103781
## Rushing Yards             -0.18281352
## Yards Per Carry           -0.17824523
## Passer Rating              0.15852117
## Rushing Attempts          -0.11793770
## Passing Yards Per Attempt  0.10863208
## Completion Percentage      0.06715226
## Rushing TDs               -0.04276507
## TD Passes                  0.03053012

Plots

# Score plot of PC1 vs PC2 colored by game outcome
pca_scores <- as.data.frame(pluck(pca_out, 'x'))
pca_scores$labels <- qb_logs$Outcome
ggplot(pca_scores, aes(x = PC1, y = PC2, color = factor(labels))) +
    geom_point() +
    labs(x = 'PC1', y = 'PC2') +
    scale_color_viridis_d() +
    theme_classic()

# Score plot of PC1 vs PC2 colored by hierarchical cluster assignment
pca_out %>%
    pluck('x') %>%
    as.data.frame() %>%
    mutate(labels = qb_logs$hclust_num6) %>%
    ggplot(aes(PC1, PC2, color = factor(labels))) +
    geom_point() +
    labs(x = 'PC1', y = 'PC2') +
    scale_color_viridis_d() +
    theme_classic()

# Variance explained by each principal component: squared singular values
# and their proportion of the total variance.
var_explained <- (pca_out %>% pluck('sdev'))^2
pve <- var_explained / sum(var_explained)

var_data <- tibble(
    PC = seq_along(var_explained), # seq_along() is the idiomatic, zero-safe form of seq_len(length(x))
    var_explained = var_explained,
    pve = pve
)
    
# Construct scree plots
# Scree plot: proportion of variance explained by each PC
# (fixes axis-label typo "varinace" -> "variance")
p1 <- var_data %>%
    ggplot(aes(x = PC, y = pve)) +
    geom_point() + 
    geom_line() + 
    labs(x = 'Principal Component', y = 'Proportion of variance explained') +
    theme_classic()

# Cumulative scree plot: running total of variance explained, useful for
# choosing how many PCs to retain.
p2 <- var_data %>%
    mutate(cum_pve = cumsum(pve)) %>%
    ggplot(aes(x = PC, y = cum_pve)) +
    geom_point() +
    geom_line() +
    labs(x = 'Principal Component', y = 'Cumulative proportion of variance explained') +
    theme_classic()

# Display both scree plots side by side
library(ggpubr) 
ggarrange(p1, p2)